%matplotlib ipympl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import timedelta
def load_try_encodings(file: str):
    """
    Read a pipe-delimited text file into a DataFrame.

    The file is decoded as UTF-8 first; if that raises a
    ``UnicodeDecodeError`` it is read again as ISO-8859-1.

    Parameters
    ----------
    file : str
        Path of the ``|``-delimited file to read.

    Returns
    -------
    pd.DataFrame
    """
    read_options = {"delimiter": "|"}
    try:
        frame = pd.read_csv(file, encoding="utf-8", **read_options)
    except UnicodeDecodeError:
        frame = pd.read_csv(file, encoding="ISO-8859-1", **read_options)
    return frame
    
def load_year(voter_history: str, voter_list: str):
    """
    Load one election's voter list and mark who voted.

    Parameters
    ----------
    voter_history : str
        Path of the pipe-delimited voter-history file. Needs the columns
        "Voter ID Number" and "Election Date".
    voter_list : str
        Path of the pipe-delimited voter-list file. Needs the columns
        "Voter ID Number" and "Date of Birth".

    Returns
    -------
    pd.DataFrame
        The voter list indexed by "Voter ID Number" with two extra columns:
        ``age`` (whole 365.2425-day years between birth and the election
        date) and ``voted`` (True when the ID appears in the history file).
    """
    history = load_try_encodings(voter_history)
    vlist = load_try_encodings(voter_list)

    # Strip surrounding whitespace from every column header.
    history = history.rename(columns=lambda col: col.strip())
    vlist = vlist.rename(columns=lambda col: col.strip())

    history = history.set_index("Voter ID Number")
    vlist = vlist.set_index("Voter ID Number")

    # Manual corrections: registrations whose birth date is clearly wrong and
    # cannot be repaired are removed from both tables.
    drop_ids = [
        # erroneously entered registration with a birth year of 1191;
        # re-registered with a different id the same year
        "02CLS2791002",

        # 2015 birthdays that are unclear how to correct
        "01GSR0112000",  # 01/01/1812
        "01MCN0112006",  # 01/01/1812
        "01ANE0112001",  # 01/01/1812
        "01WXO0109000",  # 01/01/1809

        # 2013 birthdays that are unclear how to correct
        "01ACE0108001",  # 01/01/1808
        "01SEN0108008",  # 01/01/1808
        "01HPL0108001",  # 01/01/1808
        "01RKL0108001",  # 01/01/1808
        "01ARA0108004",  # 01/01/1808
        "01MRT0108003",  # 01/01/1808
        "01DOA0107001",  # 01/01/1807
        "01CBN0108003",  # 01/01/1808
        "01BRD0108006",  # 01/01/1808
        "01PAN0107002",  # 01/01/1807
        "01VAY0105002",  # 01/01/1805
        "01LNN0107002",  # 01/01/1807
        "01MJA0107011",  # 01/01/1807

        # 2012 birthdays that are unclear how to correct
        "09GAA0487001",  # 09/04/1487

        # 2011 birthdays that are unclear how to correct
        "01QLO0105000",  # 01/01/1805
        "01KAA0106007",  # 01/01/1806
        "01LSH0108007",  # 01/01/1808
        "01KJN0111026",  # 01/01/1811
        "01AMN0108001",  # 01/01/1808
    ]
    vlist = vlist.drop(drop_ids, errors='ignore')
    history = history.drop(drop_ids, errors='ignore')

    # Author's note carried over from the original cell:
    #   04WDA0180001 in 2011, 12GDA0186001, 01DML1591001, 08WJB0481002
    #   "I modified the birth date year from 0980 to 1980"
    # (apparently edited by hand in the raw files; no correction is applied here).

    birth_dates = pd.to_datetime(vlist['Date of Birth'])

    # The election date is taken from the first history row, so this will
    # break if the file holds more than one election for the year.
    elec_date = pd.to_datetime(history['Election Date'].iloc[0])

    # Age in whole years, using the mean Gregorian year length.
    vlist['age'] = (elec_date - birth_dates) // timedelta(days=365.2425)

    # A voter "voted" when their ID shows up in the history table.
    vlist['voted'] = vlist.index.isin(history.index)
    return vlist
# Even-year elections (the 2014 / 2012 file names say "State Election" / "StatePres").
voters_2022 = load_year(
    voter_history="11-8-22 Voter History 49ANP_269498.txt",
    voter_list="49VOT_267488 nov 2022 voting list.txt",
)
voters_2020 = load_year(
    voter_history="11-3-20 Voter History 49ANP_225530.txt",
    voter_list="49VOT_224084 november 2020 election.txt",
)
voters_2018 = load_year(
    voter_history="11-6-18 Voter History 49ANP_162771.txt",
    voter_list="49VOT_162354 - Nov 6 2018 Election.txt",
)
voters_2016 = load_year(
    voter_history="11-8-16 Voter History 49ANP_140283.txt",
    voter_list="49VOT_139226 - Nov 8 2016.txt",
)
voters_2014 = load_year(
    voter_history="11-4-2014 State Election 49ANP_120872.txt",
    voter_list="49VOT_120372 - nov 4 2014.txt",
)
voters_2012 = load_year(
    voter_history="11.6.2012 StatePres 49ANP_103892.txt",
    voter_list="49VOT_103340 Nov 2012 election.txt",
)

# Odd-year elections (the 2013 file name says "Municipal Election").
voters_2021 = load_year(
    voter_history="11-2-21 Voter History 49ANP_239643.txt",
    voter_list="49VOT_238743 Nov 2021 election.txt",
)
voters_2019 = load_year(
    voter_history="11-5-19 Voter History 49ANP_202374.txt",
    voter_list="49VOT_199524 - Nov 2019 election.txt",
)
voters_2017 = load_year(
    voter_history="11.7.17 Voter History 49ANP_150723.txt",
    voter_list="49VOT_150177 - nov 7 2017.txt",
)
voters_2015 = load_year(
    voter_history="11.3.15 Voter History 49ANP_129528.txt",
    voter_list="49VOT_128567 - Nov 3, 2015.txt",
)
voters_2013 = load_year(
    voter_history="11.5.13 Municipal Election 49ANP_112159.txt",
    voter_list="49VOT_111500 - nov 5, 2013.txt",
)
voters_2011 = load_year(
    voter_history="11.8.2011 Voter History 49ANP_91255.txt",
    voter_list="49VOT_90931 Nov 2011 election.txt",
)
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[3], line 1
----> 1 voters_2022 = load_year("11-8-22 Voter History 49ANP_269498.txt", "49VOT_267488 nov 2022 voting list.txt")
      2 voters_2020 = load_year("11-3-20 Voter History 49ANP_225530.txt", "49VOT_224084 november 2020 election.txt")
      3 voters_2018 = load_year("11-6-18 Voter History 49ANP_162771.txt", "49VOT_162354 - Nov 6 2018 Election.txt")

Cell In[2], line 8, in load_year(voter_history, voter_list)
      7 def load_year(voter_history: str, voter_list: str,  ):
----> 8     history = load_try_encodings(voter_history)
      9     vlist = load_try_encodings(voter_list)
     10     #     history = pd.read_csv(voter_history, delimiter="|", encoding="utf-8")
     11     # except UnicodeDecodeError:
     12     #     history = pd.read_csv(voter_history, delimiter="|", encoding="ISO-8859-1")
     13     # vlist = pd.read_csv(voter_list, delimiter="|", encoding = "ISO-8859-1")
     14     # history = pd.read_csv("11-2-21 Voter History 49ANP_239643.txt", delimiter="|")
     15     # vlist = pd.read_csv("49VOT_238743 Nov 2021 election.txt", delimiter="|", encoding = "ISO-8859-1")

Cell In[2], line 3, in load_try_encodings(file)
      1 def load_try_encodings(file: str):
      2     try:
----> 3         return pd.read_csv(file, delimiter="|", encoding="utf-8")
      4     except UnicodeDecodeError:
      5         return pd.read_csv(file, delimiter="|", encoding="ISO-8859-1")

File ~/mambaforge/envs/voters/lib/python3.11/site-packages/pandas/io/parsers/readers.py:948, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
    935 kwds_defaults = _refine_defaults_read(
    936     dialect,
    937     delimiter,
   (...)
    944     dtype_backend=dtype_backend,
    945 )
    946 kwds.update(kwds_defaults)
--> 948 return _read(filepath_or_buffer, kwds)

File ~/mambaforge/envs/voters/lib/python3.11/site-packages/pandas/io/parsers/readers.py:611, in _read(filepath_or_buffer, kwds)
    608 _validate_names(kwds.get("names", None))
    610 # Create the parser.
--> 611 parser = TextFileReader(filepath_or_buffer, **kwds)
    613 if chunksize or iterator:
    614     return parser

File ~/mambaforge/envs/voters/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1448, in TextFileReader.__init__(self, f, engine, **kwds)
   1445     self.options["has_index_names"] = kwds["has_index_names"]
   1447 self.handles: IOHandles | None = None
-> 1448 self._engine = self._make_engine(f, self.engine)

File ~/mambaforge/envs/voters/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1705, in TextFileReader._make_engine(self, f, engine)
   1703     if "b" not in mode:
   1704         mode += "b"
-> 1705 self.handles = get_handle(
   1706     f,
   1707     mode,
   1708     encoding=self.options.get("encoding", None),
   1709     compression=self.options.get("compression", None),
   1710     memory_map=self.options.get("memory_map", False),
   1711     is_text=is_text,
   1712     errors=self.options.get("encoding_errors", "strict"),
   1713     storage_options=self.options.get("storage_options", None),
   1714 )
   1715 assert self.handles is not None
   1716 f = self.handles.handle

File ~/mambaforge/envs/voters/lib/python3.11/site-packages/pandas/io/common.py:872, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    863         handle = open(
    864             handle,
    865             ioargs.mode,
   (...)
    868             newline="",
    869         )
    870     else:
    871         # Binary mode
--> 872         handle = open(handle, ioargs.mode)
    873     handles.append(handle)
    875 # Convert BytesIO or file objects passed with an encoding

FileNotFoundError: [Errno 2] No such file or directory: '11-8-22 Voter History 49ANP_269498.txt'
# Columns kept in the combined table, in display order. The mailing-address
# columns, 'Unnamed: 25', 'Unnamed: 26', 'Record Sequence Number' and 'Title'
# are deliberately left out.
col_order = [
    'Last Name',
    'Middle Name',
    'First Name',
    'voted',
    'age',
    'Date of Birth',
    'Date of Registration',
    'Residential Address Street Number',
    'Residential Address Street Name',
    'univ_housing_name',
    'Residential Address Street Suffix',
    'Residential Address Apartment Number',
    'Residential Address Zip Code',
    'Gender F/M',
    'Voter Status',
    'Party Affiliation',
    'Ward Number',
    'Precinct Number',
    'Congressional District Number',
    'Senatorial District Number',
    'State Representative District',
]

# Stack the per-year tables under an outer index level holding the year.
frames_by_year = {
    2011: voters_2011,
    2012: voters_2012,
    2013: voters_2013,
    2014: voters_2014,
    2015: voters_2015,
    2016: voters_2016,
    2017: voters_2017,
    2018: voters_2018,
    2019: voters_2019,
    2020: voters_2020,
    2021: voters_2021,
    2022: voters_2022,
}
voters = pd.concat(list(frames_by_year.values()), keys=list(frames_by_year))

# Add a university housing name column ("NA" until a housing lookup fills it);
# it makes things like a groupby over MIT dorms easier.
voters = voters.assign(univ_housing_name="NA")[col_order]
voters.index.names = ["year", "Voter ID Number"]
voters
def turnout_by_year_key(df, key, binn=None):
    """
    Calculate turnout per year based on the variable *key*.

    Parameters
    ----------
    df : pd.DataFrame
        Expected to have an outer (multi)index level of *year*, a boolean
        ``voted`` column and a column named *key*.
    key : str
        The column to use for value_counts. e.g. "age"
    binn : optional
        Unused. Kept (now optional) so existing positional callers keep
        working; previously it was required, which made every two-argument
        call raise ``TypeError``.

    Returns
    -------
    pd.DataFrame
        Indexed by ``(year, key)`` with integer ``voted`` and ``registered``
        counts and ``turnout`` = voted / registered.
    """
    def _process_year(year_df):
        # Count rows per value of *key*, once for voters and once for everyone.
        voted_counts = year_df.loc[year_df['voted'], key].value_counts().sort_index()
        reg_counts = year_df[key].value_counts().sort_index()
        counts = pd.DataFrame({"voted": voted_counts, "registered": reg_counts})
        # Values with registrations but no votes come out as NaN -> 0 votes.
        return counts.fillna(0).astype(int)

    # Take the years from the frame that was passed in, not from a global.
    years = df.index.unique(level=0)
    out = pd.concat([_process_year(df.loc[year]) for year in years], keys=years)
    out.index = out.index.set_names(["year", key])
    out['turnout'] = out['voted'] / out['registered']
    return out

df = turnout_by_year_key(voters, "age")
df

# Collapse single-year ages into 4-year bins (edges 18, 22, ..., 110) and sum
# the voted / registered counts inside each (year, bin).
grouped = df.reset_index()
age_groups = pd.cut(grouped['age'], np.arange(18, 114, 4), include_lowest=True)
grouped = (
    grouped.assign(age_group=age_groups)
    .groupby(["year", "age_group"])
    .sum()
    .sort_index()
    .drop("age", axis=1)
    .reset_index()
)

# Bin midpoints are kept as a column: convenient x-positions for plotting.
mid_points = [interval.mid for interval in grouped['age_group']]
grouped['mid_points'] = mid_points

# Replace the Interval objects by "left-right" strings so the bins are easy to
# address through the MultiIndex.
grouped['age_group'] = [
    f"{int(np.round(interval.left))}-{int(interval.right)}"
    for interval in grouped['age_group']
]
grouped = grouped.set_index(['year', 'age_group'])

# The summed per-age turnout is meaningless; recompute it from the bin totals.
grouped['turnout'] = grouped['voted'] / grouped['registered']
grouped

Turnout vs registration by age group (Municipal Elections)#

def turnout_bar_graph(df, ax=None):
    """
    Draw stacked bars of voters and registered non-voters per age bin.

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns 'mid_points' (bar positions), 'voted' and
        'registered'.
    ax : matplotlib Axes, optional
        Axes to draw on; the current axes (``plt.gca()``) when omitted.
    """
    target = plt.gca() if ax is None else ax
    bar_style = {"width": 3.75}
    positions = df['mid_points']
    voted = df['voted']
    # Green bars: ballots cast. Gray bars stacked on top: registered but did not vote.
    target.bar(positions, voted, color='tab:green', label="Voted", **bar_style)
    target.bar(
        positions,
        df['registered'] - voted,
        bottom=voted,
        color="gray",
        label="Registered - did not vote",
        **bar_style,
    )
    target.set_xlim([17, 85])
# One panel per election for the nine most recent years (2022 back to 2014),
# newest first, sharing both axes.
fig, axs = plt.subplots(3, 3, figsize=(16, 6), sharex=True, sharey=True)

years = np.arange(2011, 2023)
bar_width = .75
for year, ax in zip(years[::-1], axs.flat):
    ax.set_title(f"{year}")
    turnout_bar_graph(grouped.loc[year], ax)
fig.supxlabel("Age (4 year bins)")
# Only the last panel carries the legend.
ax.legend()
plt.tight_layout()
# Horizontal version of the 2022 chart: voted bars drawn over registered bars.
ages = np.arange(18, 79, 4)
groups = [f"{low}-{low+4}" for low in ages]

df = grouped.loc[2022]
bar_width = 3.75
fig, ax = plt.subplots(figsize=(6, 14))
for column, color, label in (
    ("registered", "gray", "Registered"),
    ("voted", "tab:green", "Voted"),
):
    ax.barh(df['mid_points'], df[column], height=bar_width, color=color, label=label)
# Tick at the middle of each 4-year bin, labelled with the bin range.
ax.set_yticks(ages+2, labels=groups, fontsize=20)
ax.legend(fontsize=25)
# Reversed limits put the youngest bin at the top.
ax.set_ylim([85, 17])
plt.tight_layout()
# One line per age bin across election years: first the turnout ratio, then
# the raw number of votes. Bins are colored along viridis by their lower edge.
ages = np.arange(18, 75, 4)
groups = [f"{low}-{low+4}" for low in ages]
colors = plt.cm.viridis(ages/ages.max())

for column, y_label in (("turnout", "Turnout %"), ("voted", "Turnout Number")):
    plt.figure()
    for color, group in zip(colors, groups):
        grouped[column].xs(group, level=1).plot(label=group, style='o--', color=color)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    plt.title("Turnout by Age and Year")
    plt.ylabel(y_label)
    plt.xlabel("Election Year")
    plt.tight_layout()
    plt.grid()
from cycler import cycler

# Turnout (left, in percent) and votes cast (right) versus age bin, one curve
# per election year. Even years are dashed and slightly transparent, odd
# years solid.
years = np.arange(2011, 2023)
fig, axs = plt.subplots(1, 2, figsize=(12, 4.5))
for panel in axs:
    panel.set_prop_cycle(cycler(color=plt.get_cmap("tab20").colors))

for year in years:
    even_year = year % 2 == 0
    alpha = .8 if even_year else 1
    linestyle = '--' if even_year else '-'

    year_rows = grouped.loc[year]
    bin_centers = year_rows['mid_points'].values
    axs[0].plot(bin_centers, year_rows['turnout'].values*100, 'o', linestyle=linestyle, label=year, alpha=alpha)
    axs[1].plot(bin_centers, year_rows['voted'].values, 'o', linestyle=linestyle, label=year, alpha=alpha)

axs[0].set_title("Turnout %")
axs[0].set_ylabel("%")
axs[1].set_title("Turnout Numbers")
axs[1].set_ylabel("Number of Votes Cast")
for panel in axs:
    panel.grid()

labelsize = 15
fig.supxlabel("Age (4 yr bins)", size=labelsize)
plt.legend()
plt.tight_layout()
def _plot_odd_years(column):
    """Plot *column* of ``grouped`` against the age-bin midpoints for each odd-year election, newest first, on the current axes."""
    for election_year in (2021, 2019, 2017, 2015, 2013, 2011):
        rows = grouped.loc[election_year]
        plt.plot(rows['mid_points'].values, rows[column].values, 'o--', label=f'{election_year}', alpha=.8)
    plt.legend()


# Turnout ratio versus age.
fig, ax = plt.subplots()
ax.set_prop_cycle(cycler(color=plt.get_cmap("tab20").colors))
plt.title("Turnout vs Age")
_plot_odd_years('turnout')
plt.xlabel("Age (4 yr bins)")
plt.ylabel("Turnout %")

# Votes cast versus age.
plt.figure()
plt.title("Turnout vs Age")
plt.grid()
_plot_odd_years('voted')
plt.xlabel("Age (4 yr bins)")
plt.ylabel("Turnout (Vote count)")
plt.tight_layout()

# Registered voters versus age.
plt.figure()
plt.title("Turnout vs Age")
plt.grid()
_plot_odd_years('registered')
plt.xlabel("Age (4 yr bins)")
plt.ylabel("Registered Voters")
plt.tight_layout()

University Housing/Dorms#

from collections import defaultdict
from collections.abc import Iterable
def find_housing_idxs(df: pd.DataFrame, housing_locations: dict):
    """
    Build boolean row masks for named housing locations and tag the rows.

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns 'Residential Address Street Name' and
        'Residential Address Street Number'. Modified in place: the
        'univ_housing_name' column is set to the location name on every
        matching row.
    housing_locations : dict
        Maps a location name to either one ``(street_num, street_name)`` pair
        or a list of such pairs (a complex with several addresses).
        ``street_num`` may be ``None`` (any number on that street), a single
        number, or an iterable of accepted numbers.

    Returns
    -------
    defaultdict
        ``name -> boolean array of length len(df)``, plus the key ``'all'``
        holding the union of every location mask (all False when
        *housing_locations* is empty).
    """
    street_names = df['Residential Address Street Name']
    street_numbers = df['Residential Address Street Number']

    def _find_idx(street_num, street_name):
        # A row matches when its recorded street name is contained in the
        # requested name. Non-string entries (e.g. NaN) never match; before,
        # they raised TypeError in the ``in`` test.
        idx = np.array(
            [isinstance(street, str) and street in street_name for street in street_names],
            dtype=bool,
        )

        if street_num is not None:
            if not isinstance(street_num, Iterable):
                # Wrap a single number so it is handled like places with
                # several valid street numbers.
                street_num = (street_num, )
            idx &= np.array([num in street_num for num in street_numbers], dtype=bool)

        return idx

    indices = defaultdict(lambda: np.zeros(len(df), dtype=bool))
    for name, spec in housing_locations.items():
        # A list means a complex with multiple addresses - e.g. holden green.
        addresses = spec if isinstance(spec, list) else [spec]
        for addr in addresses:
            indices[name] |= _find_idx(addr[0], addr[1])

    # Start from an all-False mask so that 'all' is always a row mask, even
    # when no location was given.
    all_mask = np.zeros(len(df), dtype=bool)
    for name, idx in indices.items():
        df.loc[idx, 'univ_housing_name'] = name
        all_mask |= idx
    indices['all'] = all_mask
    return indices

Harvard Grad Dorms#

# Harvard graduate dorms, matched on street name only (no street number).
gsas_dorms = {
    name: (None, name.upper())
    for name in ("richards hl", "perkins hl", "conant hl", "child hl")
}
gsas_idx = find_housing_idxs(voters, gsas_dorms)

# Bar chart per election, newest first, for the nine most recent years.
fig, axs = plt.subplots(3, 3, figsize=(16, 6), sharex=True, sharey=True)
fig.suptitle("Harvard Grad Dorms")
bar_width = .75
for year, ax in zip(years[::-1], axs.flat):
    ax.set_title(f"{year}")
    university_housing_bar_chart(ax, year, gsas_idx['all'], voters)
# Shorten the tick labels of the last panel to their first word.
ax.set_xticklabels([tick.get_text().split()[0] for tick in ax.get_xticklabels()])
ax.legend()
plt.tight_layout()

df = turnout_by_year_key(voters, "univ_housing_name")
# groupby is used only to get the index ordered as (housing name, year).
df = df.groupby(["univ_housing_name", "year"]).mean()
df

# Registered / voted / turnout over the years, one line per dorm.
fig, axs = plt.subplots(1, 3, figsize=(12, 4.5), constrained_layout=True)
axs[0].set_title("Registered")
axs[1].set_title("Voted")
axs[2].set_title("Turnout %")
for dorm in gsas_dorms:
    dorm_rows = df.loc[dorm]
    short_name = dorm.split()[0]
    for panel, column in zip(axs, ("registered", "voted", "turnout")):
        panel.plot(dorm_rows[column], 'o--', label=short_name)
plt.ylabel("Turnout %")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

Harvard Law School#

# Harvard Law School dorms, matched on street name only (no street number).
hvd_law_dorms = {
    name: (None, name.upper())
    for name in ("dane hl", "ames hl", "shaw hl", "story hl", "holmes hl", "hastings hl", "north hl")
}
hvd_law_idx = find_housing_idxs(voters, hvd_law_dorms)

# Bar chart per election, newest first, for the nine most recent years.
fig, axs = plt.subplots(3, 3, figsize=(16, 6), sharex=True, sharey=True)
plt.suptitle("Harvard Law Dorms Turnout")
for year, ax in zip(years[::-1], axs.flat):
    ax.set_title(f"{year}")
    university_housing_bar_chart(ax, year, hvd_law_idx['all'], voters)
    ax.tick_params(axis='x', labelrotation=90)
# Shorten the tick labels of the last panel to their first word.
ax.set_xticklabels([tick.get_text().split()[0] for tick in ax.get_xticklabels()])
ax.legend()
plt.tight_layout()

df = turnout_by_year_key(voters, "univ_housing_name")
# groupby is used only to get the index ordered as (housing name, year).
df = df.groupby(["univ_housing_name", "year"]).mean()

# Registered / voted / turnout over the years, one line per dorm.
fig, axs = plt.subplots(1, 3, figsize=(12, 4.5), constrained_layout=True)
axs[0].set_title("Registered")
axs[1].set_title("Voted")
axs[2].set_title("Turnout %")
for dorm in hvd_law_dorms:
    dorm_rows = df.loc[dorm]
    short_name = dorm.split()[0]
    for panel, column in zip(axs, ("registered", "voted", "turnout")):
        panel.plot(dorm_rows[column], 'o--', label=short_name)
plt.ylabel("Turnout %")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

Harvard Housing#

Still missing some of the larger buildings; see the map here: https://osa.gse.harvard.edu/files/gse-osa/files/hu_housing_map.pdf

It would also be useful to normalize by the number of units in each building.

# Harvard University Housing buildings as name -> (street number(s), street name).
# A list value is a complex with several addresses; find_housing_idxs handles it.
harvard_housing = {
    "Peabody Terrace": (None, "PEABODY TER"),
    "Holden Green": [
        (None, "HOLDEN GRN"),
        (list(range(10, 38+2, 2)), "HOLDEN ST"),
    ],
    "29 Garden St": (29, "GARDEN ST"),
    "Botanic Gardens": (None, "FERNALD DR"),
    "Kirkland Court": ((37, 39, 31), "KIRKLAND ST"),
    "10 Akron": (10, "AKRON ST"),
    # Author's note: as it stands the lookup won't differentiate between
    # 13 and 13A Ware St, so it should pick up both.
    "Ware St": ((9, 11, 13, 15, 17, 19), "WARE ST"),
    "Prescott": (list(range(85, 95+1, 2)), "PRESCOTT ST"),
}

# TODO: Haskins Hall, Beckwith Circle, Terry Terrace
harvard_housing_idx = find_housing_idxs(voters, harvard_housing)

# Bar chart per election, newest first, for the nine most recent years.
fig, axs = plt.subplots(3, 3, figsize=(17, 8), sharex=True, sharey=True)
plt.suptitle("Harvard University Housing Turnout")
for year, ax in zip(years[::-1], axs.flat):
    ax.set_title(f"{year}")
    university_housing_bar_chart(ax, year, harvard_housing_idx['all'], voters)
    ax.tick_params(axis='x', labelrotation=90)
ax.legend()
plt.tight_layout()

df = turnout_by_year_key(voters, "univ_housing_name")
# groupby is used only to get the index ordered as (housing name, year).
df = df.groupby(["univ_housing_name", "year"]).mean()

# Registered / voted / turnout over the years, one line per building.
fig, axs = plt.subplots(1, 3, figsize=(16, 6), constrained_layout=True)
axs[0].set_title("Registered")
axs[1].set_title("Voted")
axs[2].set_title("Turnout %")
for dorm in harvard_housing:
    dorm_rows = df.loc[dorm]
    short_name = dorm.split()[0]
    # Only the turnout panel (the one that gets the legend) uses the full name.
    panel_labels = (short_name, short_name, dorm)
    for panel, column, label in zip(axs, ("registered", "voted", "turnout"), panel_labels):
        panel.plot(dorm_rows[column], 'o--', label=label)
plt.ylabel("Turnout %")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

Undergrad houses#

# Harvard undergraduate houses, matched on "<NAME> HOUSE" as the street name.
harvard_ugrad_houses = {
    f"{house} House": (None, f"{house} House".upper())
    for house in (
        "Leverett", "Pforzheimer", "Adams", "Currier", "Cabot", "Dunster",
        "Eliot", "Kirkland", "Lowell", "Mather", "Quincy", "Winthrop",
    )
}
harvard_ugrad_idx = find_housing_idxs(voters, harvard_ugrad_houses)

# Bar chart per election, newest first, for the nine most recent years.
fig, axs = plt.subplots(3, 3, figsize=(16, 6), sharex=True, sharey=True)
plt.suptitle("Harvard Undergrad house turnout")
bar_width = .75
for year, ax in zip(years[::-1], axs.flat):
    ax.set_title(f"{year}")
    university_housing_bar_chart(ax, year, harvard_ugrad_idx['all'], voters)
    ax.tick_params(axis='x', labelrotation=90)
# Shorten the tick labels of the last panel to their first word.
ax.set_xticklabels([tick.get_text().split()[0] for tick in ax.get_xticklabels()])
ax.legend()
plt.tight_layout()

df = turnout_by_year_key(voters, "univ_housing_name")
# groupby is used only to get the index ordered as (housing name, year).
df = df.groupby(["univ_housing_name", "year"]).mean()

# Registered / voted / turnout over the years, one line per house.
fig, axs = plt.subplots(1, 3, figsize=(16, 6), constrained_layout=True)
axs[0].set_title("Registered")
axs[1].set_title("Voted")
axs[2].set_title("Turnout %")
for dorm in harvard_ugrad_houses:
    dorm_rows = df.loc[dorm]
    short_name = dorm.split()[0]
    # Only the turnout panel (the one that gets the legend) uses the full name.
    panel_labels = (short_name, short_name, dorm)
    for panel, column, label in zip(axs, ("registered", "voted", "turnout"), panel_labels):
        panel.plot(dorm_rows[column], 'o--', label=label)
plt.ylabel("Turnout %")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

1st year#

# Harvard first-year dorms, matched on "<NAME> HL" as the street name.
# "Wigglesworth" previously carried a trailing space, which produced the
# street name "WIGGLESWORTH  HL" (two spaces).
# NOTE(review): "Mass Hall" expands to "MASS HALL HL" - confirm how that
# building is actually recorded in the voter list.
harvard_1st_year = [h+" HL" for h in [
    "Greenough",
    "Hurlbut",
    "Pennypacker",
    "Wigglesworth",
    "Grays",
    "Matthews",
    "Weld",
    "Apley",
    "Hollis",
    "Holworthy",
    "Lionel",
    "Mass Hall",
    "Mower",
    "Stoughton",
    "Straus",
    "Canaday",
    "Thayer",
]]

harvard_1st_year = {name: (None, name.upper()) for name in harvard_1st_year}
harvard_1st_idx = find_housing_idxs(voters, harvard_1st_year)

# Bar chart per election, newest first, for the nine most recent years.
fig, axs = plt.subplots(3, 3, figsize=(16, 6), sharex=True, sharey=True)
plt.suptitle("Harvard first-year dorm turnout")
year = 2022
bar_width = .75
for i, ax in enumerate(axs.reshape(-1)):
    year = years[-i-1]
    ax.set_title(f"{year}")
    # Use the first-year mask; this section used to plot the undergrad-house
    # mask (harvard_ugrad_idx) by copy-paste mistake.
    university_housing_bar_chart(ax, year, harvard_1st_idx['all'], voters)
    ax.tick_params(axis='x', labelrotation=90)
# Shorten the tick labels of the last panel to their first word.
ax.set_xticklabels([l.get_text().split()[0] for l in ax.get_xticklabels()])
ax.legend()
plt.tight_layout()

df = turnout_by_year_key(voters, "univ_housing_name")

# groupby is used only to get the index ordered as (housing name, year).
df = df.groupby(["univ_housing_name", "year"]).mean()

# Registered / voted / turnout over the years, one line per first-year dorm
# (previously this looped over the undergrad houses again).
fig, axs = plt.subplots(1, 3, figsize=(16, 6), constrained_layout=True)
axs[0].set_title("Registered")
axs[1].set_title("Voted")
axs[2].set_title("Turnout %")
matched_names = df.index.get_level_values("univ_housing_name")
for dorm in harvard_1st_year.keys():
    if dorm not in matched_names:
        # No voter was matched to this dorm in any year; nothing to plot.
        continue
    axs[0].plot(df.loc[dorm]['registered'], 'o--', label=dorm.split()[0])
    axs[1].plot(df.loc[dorm]['voted'], 'o--', label=dorm.split()[0])
    axs[2].plot(df.loc[dorm]['turnout'], 'o--', label=dorm)
plt.ylabel("Turnout %")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

MIT#

Undergrad#

These dorms are recorded with real addresses rather than house names

# MIT undergraduate dorms as name -> (street number(s), street name).
mit_dorms = {
    "Baker House": (362, "MEMORIAL DR"),
    "Burton Conner": (410, "MEMORIAL DR"),
    "East Campus": (3, "AMES ST"),
    "MacGregor House": (450, "MEMORIAL DR"),
    "Maseeh Hall": (305, "MEMORIAL DR"),
    "McCormick Hall": (320, "MEMORIAL DR"),
    "New House": (tuple(range(471, 476+1)), "MEMORIAL DR"),
    "Next House": (500, "MEMORIAL DR"),
    "New Vassar": (189, "VASSAR ST"),
    "Random Hall": (290, "MASSACHUSETTS AVE"),
    "Simmons Hall": (tuple(range(229, 243+1)), "VASSAR ST"),
}

mit_ugrad_idx = find_housing_idxs(voters, mit_dorms)

# Bar chart per election, newest first, for the nine most recent years.
fig, axs = plt.subplots(3, 3, figsize=(16, 6), sharex=True, sharey=True)
plt.suptitle("MIT Undergrad Dorm Turnout over the years")
bar_width = .75
for year, ax in zip(years[::-1], axs.flat):
    university_housing_bar_chart(ax, year, mit_ugrad_idx['all'], voters)
    ax.set_title(f"{year}")
    ax.tick_params(axis='x', labelrotation=90)
ax.legend()
plt.tight_layout()

df = turnout_by_year_key(voters, "univ_housing_name")
# groupby is used only to get the index ordered as (housing name, year).
df = df.groupby(["univ_housing_name", "year"]).mean()

# Registered / voted / turnout over the years, one line per dorm.
fig, axs = plt.subplots(1, 3, figsize=(16, 6), constrained_layout=True)
axs[0].set_title("Registered")
axs[1].set_title("Voted")
axs[2].set_title("Turnout %")
for dorm in mit_dorms:
    dorm_rows = df.loc[dorm]
    # The first panel uses the first word only; the others use the full name.
    panel_labels = (dorm.split()[0], dorm, dorm)
    for panel, column, label in zip(axs, ("registered", "voted", "turnout"), panel_labels):
        panel.plot(dorm_rows[column], 'o--', label=label)
plt.ylabel("Turnout %")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

MIT Grad#

Unlike Harvard, MIT has a convenient, easy-to-find table of its graduate residences:

https://studentlife.mit.edu/grad-residences

# MIT graduate residences as name -> (street number, street name).
mit_grad_housing = {
    "70 Amherst": (70, "AMHERST ST"),
    "Ashdown": (235, "ALBANY ST"),
    "Edgerton": (143, "ALBANY ST"),
    "Grad Tower": (45, "HAYWARD ST"),
    "Sidney Pacific": (70, "PACIFIC ST"),
    "Tang Hall": (550, "MEMORIAL DR"),
    "The Warehouse": (224, "ALBANY ST"),
    "Westgate": (540, "MEMORIAL DRIVE"),
}
mit_grad_idx = find_housing_idxs(voters, mit_grad_housing)

# Bar chart per election, newest first, one panel for each of the 12 years.
fig, axs = plt.subplots(4, 3, figsize=(16, 6), sharex=True, sharey=True)
plt.suptitle("MIT Grad Housing Turnout")
bar_width = .75
for year, ax in zip(years[::-1], axs.flat):
    university_housing_bar_chart(ax, year, mit_grad_idx['all'], voters)
    ax.set_title(f"{year}")
    ax.tick_params(axis='x', labelrotation=90)
ax.legend()
plt.tight_layout()

df = turnout_by_year_key(voters, "univ_housing_name")
# groupby is used only to get the index ordered as (housing name, year).
df = df.groupby(["univ_housing_name", "year"]).mean()

# Registered / voted / turnout over the years, one line per residence.
fig, axs = plt.subplots(1, 3, figsize=(16, 6), constrained_layout=True)
axs[0].set_title("Registered")
axs[1].set_title("Voted")
axs[2].set_title("Turnout %")
for dorm in mit_grad_housing:
    dorm_rows = df.loc[dorm]
    # The first panel uses the first word only; the others use the full name.
    panel_labels = (dorm.split()[0], dorm, dorm)
    for panel, column, label in zip(axs, ("registered", "voted", "turnout"), panel_labels):
        panel.plot(dorm_rows[column], 'o--', label=label)
plt.ylabel("Turnout %")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

Lesley#

todo

https://lesley.edu/students/housing/residence-halls

cool interactive map here: https://map.concept3d.com/?id=31#!ce/238?m/3276?s/

# Lesley University residence halls as name -> (street number(s), street name).
lesley_housing = {
    "Doble": (30, "MELLEN ST"),
    "Compass House": (14, "WENDELL ST"),
    "Everett House": (28, "WENDELL ST"),
    "Jenckes House": (31, "MELLEN ST"),
    "Kidder House": ((2, 4), "SAINT JOHNS RD"),
    "[Lesley] Kirkland House": (61, "OXFORD ST"),
    "Kris House": (68, "OXFORD ST"),
    "Lawrence Hall": (99, "BRATTLE ST"),
    "MacKenzie Hall": (36, "MELLEN ST"),
    "Malloch Hall": (38, "MELLEN ST"),
    "Mellen House": (24, "MELLEN ST"),
    "Rousmaniere House": (6, "SAINT JOHNS RD"),
    "Wendell House": (63, "OXFORD ST"),
    "White Hall": (33, "EVERETT ST"),
    "Wilbur House": (78, "OXFORD ST"),
    "Wilson House": ((16, 18), "WENDELL ST"),
    "Winthrop Hall": (list(range(1, 7+1, 2)), "SAINT JOHNS RD"),
    "Wolfard Hall": (34, "MELLEN ST"),
}
lesley_housing_idx = find_housing_idxs(voters, lesley_housing)

# Bar chart per election, newest first, one panel for each of the 12 years.
fig, axs = plt.subplots(4, 3, figsize=(16, 6), sharex=True, sharey=True)
plt.suptitle("Lesley Housing Turnout")
bar_width = .75
for year, ax in zip(years[::-1], axs.flat):
    university_housing_bar_chart(ax, year, lesley_housing_idx['all'], voters)
    ax.set_title(f"{year}")
    ax.tick_params(axis='x', labelrotation=90)
ax.legend()
plt.tight_layout()

df = turnout_by_year_key(voters, "univ_housing_name")
# groupby is used only to get the index ordered as (housing name, year).
df = df.groupby(["univ_housing_name", "year"]).mean()

# Registered / voted / turnout over the years, one line per residence hall.
fig, axs = plt.subplots(1, 3, figsize=(16, 6), constrained_layout=True)
axs[0].set_title("Registered")
axs[1].set_title("Voted")
axs[2].set_title("Turnout %")
for dorm in lesley_housing:
    dorm_rows = df.loc[dorm]
    # The first panel uses the first word only; the others use the full name.
    panel_labels = (dorm.split()[0], dorm, dorm)
    for panel, column, label in zip(axs, ("registered", "voted", "turnout"), panel_labels):
        panel.plot(dorm_rows[column], 'o--', label=label)
plt.ylabel("Turnout %")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# Exploratory checks (the results are only displayed, nothing is stored except `df`).
# Count of 2012 rows per Lesley residence hall.
np.unique(voters[lesley_housing_idx['all']].loc[2012]['univ_housing_name'], return_counts=True)
# Rebuild the per-year turnout table keyed by housing name (index: year, housing name).
df = turnout_by_year_key(voters, "univ_housing_name")
# Look up the 2011 entry for the "shaw hl" housing name.
df.loc[2011, "shaw hl"]
# Housing name of every 2012 row that matched a Lesley residence hall.
voters[lesley_housing_idx['all']].loc[2012]['univ_housing_name']